As an example, we use the pbmc3k data from SeuratData, with details shown in pbmc3k_tutorial. For convenience, we provide the data in the PaaSc package.
data <- Read10X(data.dir = "data/pbmc3k_filtered_gene_bc_matrices/hg19")
object <- CreateSeuratObject(counts = data, min.cells = 3, min.features = 200)
object
An object of class Seurat
13714 features across 2700 samples within 1 assay
Active assay: RNA (13714 features, 0 variable features)
1 layer present: counts
PaaSc computes pathway activity score for each pathway independently and requires a background defined as a collection of pathway in the database. In general, the pathway of interest should be part of the background. Take the B_cell_activation from Gene Ontology (GO) as an example, the background pathway sets can be downloaded from MSigDB.
# MCA
object <- computeMCA(object, nmcs = 20)
1.305 sec elapsed
18.053 sec elapsed
1.538 sec elapsed
# Building gene rate for pathway and background pathway sets
background_geneset <- "data/kegg_legacy.gmt"
B_cell_activation <- readLines("data/B_cell_activation.txt")
pathway_geneset <- list(B_cell_activation = B_cell_activation)
gene_rate <- getGeneRate(background.geneset = background_geneset, pathway.geneset = pathway_geneset)
head(gene_rate)
background B_cell_activation
ABCA1 0.005376344 0
ABCA10 0.005376344 0
ABCA12 0.005376344 0
ABCA13 0.005376344 0
ABCA2 0.010752688 0
ABCA3 0.005376344 0
# Regression of gene loading (Y) and gene rate (X)
regression_data <- doRegression(object, gene.rate = gene_rate)
head(regression_data[, 1:5])
mca_1 mca_2 mca_3
B_cell_activation.t 1.5047288 0.09987953 -3.289811947
B_cell_activation.background.t 0.3260923 0.47468205 -0.012395800
B_cell_activation.pvalue 0.1324888 0.92044599 0.001012992
B_cell_activation.background.pvalue 0.7443751 0.63504478 0.990110579
mca_4 mca_5
B_cell_activation.t -4.924049e+00 -2.724654808
B_cell_activation.background.t 6.320301e-01 -0.971835309
B_cell_activation.pvalue 8.889976e-07 0.006470545
B_cell_activation.background.pvalue 5.274106e-01 0.331203156
As shown above, mca_3,mca_4 and mca_5 are significantly associated with B_cell_activation pathway.
# Computing the pathway activity score
score_data <- computeScore(object, regression.data = regression_data, weight = FALSE, normalize = "z-score")
head(score_data)
B_cell_activation
AAACATACAACCAC-1 -0.34014887
AAACATTGAGCTAC-1 2.09314958
AAACATTGATCAGC-1 -1.10995620
AAACCGTGCTTCCG-1 0.04333788
AAACCGTGTATGCG-1 0.26093785
AAACGCACTGGTAC-1 -0.24407591
# visualization of pathway activity score using UMAP
object <- FindVariableFeatures(object, verbose = FALSE)
object <- ScaleData(object, verbose = FALSE)
object <- RunPCA(object, verbose = FALSE)
object <- FindNeighbors(object, verbose = FALSE)
object <- RunUMAP(object, dims = 1:20, verbose = FALSE)
# Ceiling score for better visualization
cscore <- score_data[colnames(object), 1]
cscore[cscore > quantile(cscore, 0.99)] = quantile(cscore, 0.99)
cscore[cscore < quantile(cscore, 0.01)] = quantile(cscore, 0.01)
object@meta.data$B_cell_activation <- cscore
FeaturePlot(object, features = "B_cell_activation", cols = c('white', 'darkred'))
Sometime, it’s helpful to binarize the pathway activity scores into different states, such as positive and negative. Here, we use Gaussian mixture model (GMM) to binarize the pathway activity scores.
# binarization
label_data <- doBinarization(score_data, n.cluster = 2, method = "GMM")
number of iterations= 10
head(label_data)
B_cell_activation.label
AAACATACAACCAC-1 negative
AAACATTGAGCTAC-1 positive
AAACATTGATCAGC-1 negative
AAACCGTGCTTCCG-1 negative
AAACCGTGTATGCG-1 negative
AAACGCACTGGTAC-1 negative
threshold <- min(score_data[rownames(label_data[label_data['B_cell_activation.label'] == 'positive', ]), 'B_cell_activation'])
ggplot(score_data, aes(x = B_cell_activation)) +
geom_histogram(bins = 50, fill = "grey", alpha = 0.7) +
geom_vline(xintercept = threshold, color = "red", linetype = "solid") +
annotate("text", x = threshold, y = 0.2, label = paste0("x=", signif(threshold, 2)), hjust = 0, vjust = 0, color = "black", size = 5) +
xlab("Activity score") + ylab("Frequency") + theme_minimal()
# visualization of cell states
object@meta.data$B_cell_activation_label <- label_data[colnames(object), ]
DimPlot(object, group.by = "B_cell_activation_label")
We may further evaluate the cell-type specificity using Area Under the Receiver Operating Characteristic Curve (AUC-ROC) value. Here, we use the pre-annotated celltype information to measure whether B_cell_activation is specific to B cells or not.
annotation_file <- "data/pbmc3k_seurat_annotation.txt"
seurat_annotations <- read.table(annotation_file, header = TRUE, row.names = 1, sep = "\t")
# Plot the pre-annotated celltype in UMAP
object@meta.data$seurat_annotations <- seurat_annotations[colnames(object), ]
DimPlot(object, group.by = "seurat_annotations")
# Distribution of B cell activation score in different cell types
score_data_withanno <- cbind(score_data, seurat_annotations)
ggplot(score_data_withanno, aes(x = seurat_annotations, y = B_cell_activation, color = seurat_annotations)) +
geom_boxplot(outlier.shape = NA) + geom_jitter(width = 0.2, size = 0.01) + theme_bw() +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
theme(plot.title = element_text(hjust = 0.5)) + theme(legend.title=element_blank()) +
theme(legend.position="none") + theme(axis.text.x = element_text(vjust = 1, hjust = 1, angle = 45, size = 12))
# Calculating the AUC-ROC value
label <- rep('others', nrow(score_data_withanno))
label[score_data_withanno$seurat_annotations == "B"] <- "B"
roc_empirical <- rocit(score = score_data_withanno$B_cell_activation, class = label, negref = "others")
title <- paste0('AUC:', strsplit(as.character(summary(roc_empirical)[4,]), ':')[[1]][2])
Method used: empirical
Number of positive(s): 344
Number of negative(s): 2356
Area under curve: 0.9981
R version 4.4.2 (2024-10-31)
Platform: x86_64-apple-darwin20
Running under: macOS Sequoia 15.3
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
locale:
[1] C/UTF-8/C/C/C/C
time zone: Asia/Shanghai
tzcode source: internal
attached base packages:
[1] stats graphics grDevices utils datasets methods
[7] base
other attached packages:
[1] ROCit_2.1.2 stxBrain.SeuratData_0.1.2
[3] ifnb.SeuratData_3.1.0 SeuratData_0.2.2.9001
[5] Seurat_5.1.0 SeuratObject_5.0.2
[7] sp_2.1-4 msigdbr_7.5.1
[9] reshape2_1.4.4 ggrepel_0.9.6
[11] ggplot2_3.5.1 rmarkdown_2.28
[13] PaaSc_1.0.0
loaded via a namespace (and not attached):
[1] RcppAnnoy_0.0.22 splines_4.4.2
[3] later_1.3.2 R.oo_1.27.0
[5] tibble_3.2.1 polyclip_1.10-7
[7] fastDummies_1.7.4 lifecycle_1.0.4
[9] globals_0.16.3 lattice_0.22-6
[11] MASS_7.3-61 magrittr_2.0.3
[13] sass_0.4.9 plotly_4.10.4
[15] jquerylib_0.1.4 yaml_2.3.10
[17] httpuv_1.6.15 glmGamPoi_1.18.0
[19] sctransform_0.4.1 spam_2.11-0
[21] askpass_1.2.1 spatstat.sparse_3.1-0
[23] reticulate_1.39.0 cowplot_1.1.3
[25] pbapply_1.7-2 RColorBrewer_1.1-3
[27] abind_1.4-8 zlibbioc_1.52.0
[29] Rtsne_0.17 GenomicRanges_1.58.0
[31] R.utils_2.12.3 downlit_0.4.4
[33] purrr_1.0.2 mixtools_2.0.0
[35] BiocGenerics_0.52.0 rappdirs_0.3.3
[37] GenomeInfoDbData_1.2.13 IRanges_2.40.0
[39] S4Vectors_0.44.0 irlba_2.3.5.1
[41] listenv_0.9.1 spatstat.utils_3.1-0
[43] umap_0.2.10.0 goftest_1.2-3
[45] RSpectra_0.16-2 spatstat.random_3.3-2
[47] fitdistrplus_1.2-1 parallelly_1.38.0
[49] DelayedMatrixStats_1.28.0 leiden_0.4.3.1
[51] codetools_0.2-20 DelayedArray_0.32.0
[53] scuttle_1.16.0 tidyselect_1.2.1
[55] distill_1.6 UCSC.utils_1.2.0
[57] farver_2.1.2 ScaledMatrix_1.14.0
[59] viridis_0.6.5 matrixStats_1.4.1
[61] stats4_4.4.2 spatstat.explore_3.3-3
[63] jsonlite_1.8.9 BiocNeighbors_2.0.0
[65] progressr_0.15.0 ggridges_0.5.6
[67] survival_3.7-0 scater_1.34.0
[69] tictoc_1.2.1 segmented_2.1-3
[71] tools_4.4.2 ica_1.0-3
[73] Rcpp_1.0.13-1 glue_1.8.0
[75] gridExtra_2.3 SparseArray_1.6.0
[77] xfun_0.49 MatrixGenerics_1.18.0
[79] GenomeInfoDb_1.42.0 dplyr_1.1.4
[81] withr_3.0.2 fastmap_1.2.0
[83] fansi_1.0.6 openssl_2.2.2
[85] digest_0.6.37 rsvd_1.0.5
[87] R6_2.5.1 mime_0.12
[89] colorspace_2.1-1 scattermore_1.2
[91] tensor_1.5 spatstat.data_3.1-2
[93] R.methodsS3_1.8.2 utf8_1.2.4
[95] tidyr_1.3.1 generics_0.1.3
[97] data.table_1.16.2 httr_1.4.7
[99] htmlwidgets_1.6.4 S4Arrays_1.6.0
[101] uwot_0.2.2 pkgconfig_2.0.3
[103] gtable_0.3.6 lmtest_0.9-40
[105] SingleCellExperiment_1.28.0 XVector_0.46.0
[107] htmltools_0.5.8.1 bookdown_0.41
[109] dotCall64_1.2 fgsea_1.32.0
[111] scales_1.3.0 Biobase_2.66.0
[113] png_0.1-8 spatstat.univar_3.0-1
[115] rstudioapi_0.17.1 knitr_1.48
[117] nlme_3.1-166 cachem_1.1.0
[119] zoo_1.8-12 stringr_1.5.1
[121] KernSmooth_2.23-24 parallel_4.4.2
[123] miniUI_0.1.1.1 vipor_0.4.7
[125] pillar_1.9.0 grid_4.4.2
[127] vctrs_0.6.5 RANN_2.6.2
[129] promises_1.3.0 BiocSingular_1.22.0
[131] beachmat_2.22.0 xtable_1.8-4
[133] cluster_2.1.6 beeswarm_0.4.0
[135] CelliD_1.14.0 evaluate_1.0.1
[137] cli_3.6.3 compiler_4.4.2
[139] rlang_1.1.4 crayon_1.5.3
[141] future.apply_1.11.3 labeling_0.4.3
[143] RcppArmadillo_14.0.2-1 plyr_1.8.9
[145] ggbeeswarm_0.7.2 stringi_1.8.4
[147] viridisLite_0.4.2 deldir_2.0-4
[149] BiocParallel_1.40.0 babelgene_22.9
[151] munsell_0.5.1 lazyeval_0.2.2
[153] spatstat.geom_3.3-3 Matrix_1.7-1
[155] RcppHNSW_0.6.0 patchwork_1.3.0
[157] sparseMatrixStats_1.18.0 future_1.34.0
[159] shiny_1.9.1 highr_0.11
[161] SummarizedExperiment_1.36.0 kernlab_0.9-33
[163] ROCR_1.0-11 fontawesome_0.5.2
[165] memoise_2.0.1 igraph_2.1.1
[167] bslib_0.8.0 fastmatch_1.1-4